---
title: "Application"
author: "Neal Shin"
date: "7/25/2024"
output: html_document
---

```{r setup, include=FALSE}
rm(list = ls())
cat("\f")
options(scipen = 10)

library(MASS)
library(haven)
library(Matrix)
library(lfe)
library(ggplot2)
library(RColorBrewer)
library(Hotelling)
library(did)

source("functions.R")
LutzData <- read_dta("ros_ccd_aejp.dta")
```

### data cleaning ###

```{r data cleaning}
### I followed Lutz (2011)
DataCleaning <- list(start_year=88,
                     end_year=107,
                     district_size=10000,
                     T0=11
)

### make Boston, Detroit and Fort Worth into non-dismissal districts
LutzData$cdum[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                LutzData$leaid == 4819700] <- 0
LutzData$cdum_ever[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                     LutzData$leaid == 4819700] <- 0
LutzData$cdum_time[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                     LutzData$leaid == 4819700] <- 0
LutzData$underc[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                  LutzData$leaid == 4819700] <- 0

### multiply dissimilarity index by 100
LutzData$disd <- LutzData$disd*100

### following Lutz
# drop never-court-mandated school districts
# underc: under court order as of 1991
# ros: from the Rossell and Armor sample
# a_raceflag: insufficient race data
# totbase: n of students in 1988
### drop Holmes county consolidated school district
### for missing dissimilarity index: 2801980
LutzData <- LutzData[LutzData$ros==1 & LutzData$underc==1 & 
                       is.na(LutzData$underc)==FALSE &
                       LutzData$a_raceflag!=1 & 
                       LutzData$totbase>DataCleaning$district_size &
                       LutzData$leaid!=2801980,]

# a: first year of panel
# b: last year of panel
LutzData <- LutzData[LutzData$year >= DataCleaning$start_year & 
                       LutzData$year <= DataCleaning$end_year,]

### set arbitrarily large treatment timing for never-treated units
LutzData$cdum_time[LutzData$cdum_ever==0] <- 1000

### keep units that are observed for every period
LutzData <- 
  LutzData[sapply(LutzData$leaid,  
                  function(x) 
                    nrow(LutzData[LutzData$leaid==x,])==
                    (DataCleaning$end_year-DataCleaning$start_year+1)),]

### rename index variables and recenter them 
### drop earlier-treated units
colnames(LutzData)[c(1,2,11,48)] <- c("time", "id","E","weight")
LutzData$time <- LutzData$time - DataCleaning$start_year 
LutzData$E <- LutzData$E - DataCleaning$start_year 
LutzData <- LutzData[LutzData$E>DataCleaning$T0,] 
temp <- unique(LutzData$id)
LutzData$id <- sapply(LutzData$id,function(x) which(x==temp))
remove(temp)

T <- DataCleaning$end_year - DataCleaning$start_year
N <- length(unique(LutzData$id))
LutzData$E[LutzData$E>T] <- T+1

colnames(LutzData)[c(41,50)] <- c("n_student", "p_freelunch")

### construct a first-differenced dataset for Kmeans ###
### the order is Y, id, time, treatment timing, weight, X
KmeansData <- LutzData[,c(3,2,1,11,48,
                          49,54,56,41,50)]
KmeansData$weight <- 1

### take first differences
KmeansData$diff_disd <- 0
for (i in 1:N){
  for (t in 1:T){
    KmeansData$diff_disd[(KmeansData$id==i)*(KmeansData$time==t)==1] <- 
      KmeansData$disd[(KmeansData$id==i)*(KmeansData$time==t)==1] - 
      KmeansData$disd[(KmeansData$id==i)*(KmeansData$time==(t-1))==1]
  }
}
remove(i,t)
```

### cross-Validation ###

```{r cross validation}
### cross-validation for regularized time fixed-effects
CrossValidation <- list(eps=0,
                        maxiter=200,
                        n_of_initial_values=5000)
CrossValidation$Constant <- function(t){
  ans <- 1
  return(ans)
} # 1 variables

CrossValidation$Constant2 <- function(t){
  ans <- c(1,(t>6))
  return(ans)
} # 2 variables

CrossValidation$Constant3 <- function(t){
  ans <- c(1,(t>4),(t>7))
  return(ans)
} # 3 variables

CrossValidation$LinSpline <- function(t){
  ans <- c(1,t-1)
  return(ans)
} # 2 variables

CrossValidation$LinSpline2 <- function(t){
  ans <- c(1,t-1,(t-6)*(t>6))
  return(ans)
} # 3 variables


# Use pretreatment outcomes only
SubData <- KmeansData[KmeansData$time<=DataCleaning$T0 & 
                        KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
SubData <- as.matrix(SubData)

# drop last three time periods for cross-validation
CrossValidation$result <- matrix(0,5,4)

for (TestTime in (DataCleaning$T0-2):DataCleaning$T0){
  TrainData <- SubData[SubData[,3]<TestTime,]
  TestData <- SubData[SubData[,3]==TestTime,]
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant)
  CrossValidation$result[1,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant(TestData[i,3]))))^2
         )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant2)
  CrossValidation$result[2,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant2(TestData[i,3]))))^2
         )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$Constant3)
  CrossValidation$result[3,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$Constant3(TestData[i,3]))))^2
         )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$LinSpline)
  CrossValidation$result[4,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$LinSpline(TestData[i,3]))))^2
         )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidation$LinSpline2)
  CrossValidation$result[5,DataCleaning$T0-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidation$LinSpline2(TestData[i,3]))))^2
         )
}
remove(temp,TestTime,TrainData,TestData)
CrossValidation$result[,4] <- apply(CrossValidation$result[,1:3],1,mean)
CrossValidation$result

if (which.min(CrossValidation$result[,4])==1){
  KmeansParameter <- list(spline=CrossValidation$Constant)  
} else if (which.min(CrossValidation$result[,4])==2){
  KmeansParameter <- list(spline=CrossValidation$Constant2)
} else if (which.min(CrossValidation$result[,4])==3){
  KmeansParameter <- list(spline=CrossValidation$Constant3)
} else if (which.min(CrossValidation$result[,4])==4){
  KmeansParameter <- list(spline=CrossValidation$LinSpline)
} else if (which.min(CrossValidation$result[,4])==5){
  KmeansParameter <- list(spline=CrossValidation$LinSpline2)
}
```

### K means ###

```{r K means}
### K-means clustering using pretreatment time series
KmeansParameter$eps <- 0
KmeansParameter$maxiter <- 200
KmeansParameter$n_of_initial_values <- 5000

# Use pretreatment outcomes only
SubData <- KmeansData[KmeansData$time<=DataCleaning$T0 & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
DataCleaning$N0 <- max(SubData$id)
SubData <- as.matrix(SubData)

# Choice of K
kmeansK2 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,2,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
kmeansK3 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,3,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
kmeansK4 <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,4,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       KmeansParameter$spline)
KmeansParameter$BIC <- 
  c(kmeansK2$MSE[length(kmeansK2$MSE)] +
      (2*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)],
    kmeansK3$MSE[length(kmeansK3$MSE)] +
      (3*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)],
    kmeansK4$MSE[length(kmeansK4$MSE)] +
      (4*length(KmeansParameter$spline(1))+DataCleaning$N0+5)*
      log(DataCleaning$N0*DataCleaning$T0)/(DataCleaning$N0*DataCleaning$T0)*
      kmeansK4$MSE[length(kmeansK4$MSE)])

kmeansK2 <- extrapolate_gamma(T,kmeansK2,SubData,KmeansParameter$spline)
kmeansK3 <- extrapolate_gamma(T,kmeansK3,SubData,KmeansParameter$spline)
kmeansK4 <- extrapolate_gamma(T,kmeansK4,SubData,KmeansParameter$spline)

kmeansK2$typeTE <- retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)
kmeansK3$typeTE <- retrieve_delta(T,kmeansK3$delta,KmeansParameter$spline)
kmeansK4$typeTE <- retrieve_delta(T,kmeansK4$delta,KmeansParameter$spline)

kmeansK2$summary
kmeansK3$summary
kmeansK4$summary

kmeansK2$delta
kmeansK3$delta
kmeansK4$delta

KmeansParameter$BIC

# compare K=2, K=3 and K=4
table(cbind(kmeansK2$gamma*100+kmeansK3$gamma*10+kmeansK4$gamma))
```

### moments for simulation ###

```{r simulation moments}
### Moments for simulation
SubData <- KmeansData[KmeansData$time<=DataCleaning$T0 & KmeansData$time>0,] 
SubData$type <- kmeansK2$gamma[SubData$id]
SubData$type_time <- SubData$type*100 + SubData$time
SubData$type_time <- as.factor(SubData$type_time)

SubData$type1xsum <- (SubData$type==1)*SubData$time
SubData$type2xsum <- (SubData$type==2)*SubData$time
SubData$ccbase_time <- SubData$ccbase*SubData$time
SubData$p_white_time <- SubData$p_white*SubData$time
SubData$p_hisp_time <- SubData$p_hisp*SubData$time
SubData$n_student_time <- SubData$n_student*SubData$time
SubData$p_freelunch_time <- SubData$p_freelunch*SubData$time

model <- felm(disd ~ ccbase_time + p_white_time + p_hisp_time + 
                n_student_time + p_freelunch_time + type1xsum + type2xsum 
              | id | 0 | id,
              data=SubData, weights=SubData$weight)  
summary(model)

# unit fixed-effect
for (k in 1:max(kmeansK2$gamma)){
  print(median(apply(matrix(KmeansData[kmeansK2$gamma[KmeansData$id]==k,]$disd,
                            T+1),
                     1,mean)))
  print(median(apply(matrix(KmeansData[kmeansK2$gamma[KmeansData$id]==k,]$disd,
                            T+1),
                     1,sd)))
}

# type separation
retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)[1,2:T] - retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)[1,1:(T-1)] -
  (retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)[2,2:T] -
     retrieve_delta(T,kmeansK2$delta,KmeansParameter$spline)[2,1:(T-1)] ) 

# standard error of error
apply(matrix(abs(model$residuals),DataCleaning$T0),1,median)*1.4826
median(apply(matrix(abs(model$residuals),DataCleaning$T0),1,median))*1.4826
apply(matrix(model$residuals,DataCleaning$T0),1,sd)
median(apply(matrix(model$residuals,DataCleaning$T0),1,sd))

# serial correlation of error
SubData$res <- model$residuals
SubData$lagres <- c(0,model$residuals[1:(length(model$residuals)-1)])
SubData <- SubData[SubData$time>1,] 

mean(sapply(2:DataCleaning$T0, 
            function(t) cov(SubData$res[SubData$time==t],
                            SubData$lagres[SubData$time==t])))/
  mean(sapply(2:DataCleaning$T0, 
              function(t) var(SubData$res[SubData$time==t])))

remove(SubData,model,k)
```

### treatment timing distribution ###

```{r treatment timing distribution}
### distribution of treatment timing for each type
temp <- cbind(sapply(1:N, function(i) mean(KmeansData$E[KmeansData$id==i])),
              sapply(1:N, function(i) kmeansK2$gamma[i]))
temp <- as.data.frame(temp)
colnames(temp) <- c("E","type")
temp$type <- as.factor(temp$type)
temp_palette <- brewer.pal(n = 5, name = 'Set1')
ggplot(temp, aes(x=E, color=type, fill=type)) +
  geom_histogram(binwidth=0.5,position="dodge") +
  scale_color_manual(values=temp_palette) + 
  theme_light()

### test if the two distributions differ
temp$E2 <- temp$E==20
print(hotelling.test(E2 ~ type,data=temp))
remove(temp,temp_palette)
```

### who's who ###

```{r who is who}
temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK2$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
remove(temp)

temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK3$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
print(temp[temp$type==3,],n=N)
remove(temp)

temp <- LutzData[LutzData$time==0,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[,c(30,11,42,505)]
print(temp[temp$type==1,],n=N)
print(temp[temp$type==2,],n=N)
print(temp[temp$type==3,],n=N)
print(temp[temp$type==4,],n=N)
remove(temp)
```

### treatment effect estimation ###

```{r treatment effect}
### construct placeholder matrices for propensity score ratios
Pi <- matrix(0,N,2)
Pi[,1] <- 1
L <- matrix(0,N,1)
stackPi <- array(Pi,c(N,2,T))
stackL <- array(L,c(N,1,T))

### beta, type, r, sd, CI
pre <- 4
post <- 6

### K=2
temp <- matrix(0,(pre+post)*2,6)
# type 1 and 2
temp[,2] <- c(rep(1,pre+post), rep(2,pre+post))
temp[,3] <- c((-pre-1):-2,0:(post-1),(-pre-1):-2,0:(post-1))
for (i in 1:(2*(pre+post))){
  temp[i,1] <- estCATT(kmeansK2$gamma,
                       stackPi,
                       stackL,
                       as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$estimate
  temp[i,4] <- sd(estCATT(kmeansK2$gamma,
                          stackPi,
                          stackL,
                          as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$score)/N^(0.5)
}
temp[,5] <- temp[,1] - 1.96*temp[,4]
temp[,6] <- temp[,1] + 1.96*temp[,4]

temp <- as.data.frame(temp)
colnames(temp) <- c("beta", "type", "r", "sd", "lb", "ub")
temp$type <- as.factor(temp$type)

### graph with 95% CI
temp_palette <- brewer.pal(n = 5, name = 'Set1')
ggplot(temp, aes(x=r,y=beta,group=type)) +
  geom_line(aes(color=type),linewidth=1.5) +
  geom_point(aes(color=type), size=2) +
  geom_errorbar(aes(ymin=lb, ymax=ub,color=type), linewidth=0.5, width=.2) +
  scale_colour_manual(values=temp_palette[1:5]) +
  geom_hline(yintercept=0, color = "black") +
  theme_light()
temp

### K=3
temp <- matrix(0,(pre+post)*3,6)
temp[,2] <- c(rep(1,pre+post), rep(2,pre+post), rep(3,pre+post))
temp[,3] <- kronecker(c(1,1,1),c((-pre-1):-2,0:(post-1)))
for (i in 1:(3*(pre+post))){
  temp[i,1] <- estCATT(kmeansK3$gamma,
                       stackPi,
                       stackL,
                       as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$estimate
  temp[i,4] <- sd(estCATT(kmeansK3$gamma,
                          stackPi,
                          stackL,
                          as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$score)/N^(0.5)
}
temp[,5] <- temp[,1] - 1.96*temp[,4]
temp[,6] <- temp[,1] + 1.96*temp[,4]

temp <- as.data.frame(temp)
colnames(temp) <- c("beta", "type", "r", "sd", "lb", "ub")
temp$type <- as.factor(temp$type)

### graph with 95% CI
temp_palette <- brewer.pal(n = 5, name = 'Set1')
ggplot(temp, aes(x=r,y=beta,group=type)) +
  geom_line(aes(color=type),linewidth=1.5) +
  geom_point(aes(color=type), size=2) +
  geom_errorbar(aes(ymin=lb, ymax=ub,color=type), linewidth=0.5, width=.2) +
  scale_colour_manual(values=temp_palette[1:5]) +
  geom_hline(yintercept=0, color = "black") +
  theme_light()
temp

### K=4
temp <- matrix(0,(pre+post)*4,6)
temp[,2] <- c(rep(1,pre+post), rep(2,pre+post), rep(3,pre+post), rep(4,pre+post))
temp[,3] <- kronecker(c(1,1,1,1),c((-pre-1):-2,0:(post-1)))
for (i in 1:(4*(pre+post))){
  temp[i,1] <- estCATT(kmeansK4$gamma,
                       stackPi,
                       stackL,
                       as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$estimate
  temp[i,4] <- sd(estCATT(kmeansK4$gamma,
                          stackPi,
                          stackL,
                          as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$score)/N^(0.5)
}
temp[,5] <- temp[,1] - 1.96*temp[,4]
temp[,6] <- temp[,1] + 1.96*temp[,4]

temp <- as.data.frame(temp)
colnames(temp) <- c("beta", "type", "r", "sd", "lb", "ub")
temp$type <- as.factor(temp$type)

### graph with 95% CI
temp_palette <- brewer.pal(n = 5, name = 'Set1')
ggplot(temp, aes(x=r,y=beta,group=type)) +
  geom_line(aes(color=type),linewidth=1.5) +
  geom_point(aes(color=type), size=2) +
  geom_errorbar(aes(ymin=lb, ymax=ub,color=type), linewidth=0.5, width=.2) +
  scale_colour_manual(values=temp_palette[1:5]) +
  geom_hline(yintercept=0, color = "black") +
  theme_light()
temp
remove(i,Pi,L,stackPi,stackL,pre,post,temp,temp_palette)
```

### Callaway and Sant'anna ###
```{r Callaway and Santanna}
CSAdata <- KmeansData
CSAdata$p_white_init <- sapply(CSAdata$id,
                               function(i) CSAdata$p_white[CSAdata$time==0 & 
                                                             CSAdata$id==i]) 
CSAdata$p_hisp_init <- sapply(CSAdata$id,
                               function(i) CSAdata$p_hisp[CSAdata$time==0 & 
                                                            CSAdata$id==i]) 
CSAdata$n_student_init <- sapply(CSAdata$id,
                               function(i) CSAdata$n_student[CSAdata$time==0 & 
                                                               CSAdata$id==i]) 

CSAcatt <- att_gt(yname="disd",
                  tname="time",
                  idname="id",
                  gname="E",
                  xformla= ~ ccbase + p_white_init,
                  weightsname="weight",
                  control_group="notyettreated",
                  data=CSAdata)
CSAatt <- aggte(CSAcatt, type="dynamic", na.rm=TRUE)
CSAatt
```

### descriptive statistics ###

```{r descriptive statistics}
### K=2
### compare treated and never-treated within type
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK2$gamma[temp$id]
temp <- temp[temp$type==1,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK2$gamma[temp$id]
temp <- temp[temp$type==2,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))
remove(temp)


### compare types
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK2$gamma[temp$id]
matrix(c(mean(temp$disd[temp$type==1]), sd(temp$disd[temp$type==1]),
         mean(temp$disd[temp$type==2]), sd(temp$disd[temp$type==2]),
         mean(temp$disd[temp$type==1])-mean(temp$disd[temp$type==2]),
         (var(temp$disd[temp$type==1])/length(temp$disd[temp$type==1])+
            var(temp$disd[temp$type==2])/length(temp$disd[temp$type==2]))^(0.5)),
       2,3)
matrix(c(mean(temp$ccbase[temp$type==1]), sd(temp$ccbase[temp$type==1]),
         mean(temp$ccbase[temp$type==2]), sd(temp$ccbase[temp$type==2]),
         mean(temp$ccbase[temp$type==1])-mean(temp$ccbase[temp$type==2]),
         (var(temp$ccbase[temp$type==1])/length(temp$ccbase[temp$type==1])+
            var(temp$ccbase[temp$type==2])/length(temp$ccbase[temp$type==2]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$type==1]), sd(temp$p_white[temp$type==1]),
         mean(temp$p_white[temp$type==2]), sd(temp$p_white[temp$type==2]),
         mean(temp$p_white[temp$type==1])-mean(temp$p_white[temp$type==2]),
         (var(temp$p_white[temp$type==1])/length(temp$p_white[temp$type==1])+
            var(temp$p_white[temp$type==2])/length(temp$p_white[temp$type==2]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$type==1]), sd(temp$p_hisp[temp$type==1]),
         mean(temp$p_hisp[temp$type==2]), sd(temp$p_hisp[temp$type==2]),
         mean(temp$p_hisp[temp$type==1])-mean(temp$p_hisp[temp$type==2]),
         (var(temp$p_hisp[temp$type==1])/length(temp$p_hisp[temp$type==1])+
            var(temp$p_hisp[temp$type==2])/length(temp$p_hisp[temp$type==2]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$type==1]), sd(temp$p_freelunch[temp$type==1]),
         mean(temp$p_freelunch[temp$type==2]), sd(temp$p_freelunch[temp$type==2]),
         mean(temp$p_freelunch[temp$type==1])-mean(temp$p_freelunch[temp$type==2]),
         (var(temp$p_freelunch[temp$type==1])/length(temp$p_freelunch[temp$type==1])+
            var(temp$p_freelunch[temp$type==2])/length(temp$p_freelunch[temp$type==2]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$type==1]), sd(temp$n_student[temp$type==1]),
         mean(temp$n_student[temp$type==2]), sd(temp$n_student[temp$type==2]),
         mean(temp$n_student[temp$type==1])-mean(temp$n_student[temp$type==2]),
         (var(temp$n_student[temp$type==1])/length(temp$n_student[temp$type==1])+
            var(temp$n_student[temp$type==2])/length(temp$n_student[temp$type==2]))^(0.5)),
       2,3)
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,2)))



### K=3
### compare treated and never-treated within type
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK3$gamma[temp$id]
temp <- temp[temp$type==1,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK3$gamma[temp$id]
temp <- temp[temp$type==2,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK3$gamma[temp$id]
temp <- temp[temp$type==3,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

### compare types
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK3$gamma[temp$id]
matrix(c(mean(temp$disd[temp$type==1]), sd(temp$disd[temp$type==1]),
         mean(temp$disd[temp$type==2]), sd(temp$disd[temp$type==2]),
         mean(temp$disd[temp$type==3]), sd(temp$disd[temp$type==3]),
         mean(temp$disd[temp$type==1])-mean(temp$disd[temp$type==2]),
         (var(temp$disd[temp$type==1])/length(temp$disd[temp$type==1])+
            var(temp$disd[temp$type==2])/length(temp$disd[temp$type==2]))^(0.5)),
       2,4)
matrix(c(mean(temp$ccbase[temp$type==1]), sd(temp$ccbase[temp$type==1]),
         mean(temp$ccbase[temp$type==2]), sd(temp$ccbase[temp$type==2]),
         mean(temp$ccbase[temp$type==3]), sd(temp$ccbase[temp$type==3]),
         mean(temp$ccbase[temp$type==1])-mean(temp$ccbase[temp$type==2]),
         (var(temp$ccbase[temp$type==1])/length(temp$ccbase[temp$type==1])+
            var(temp$ccbase[temp$type==2])/length(temp$ccbase[temp$type==2]))^(0.5)),
       2,4)
matrix(c(mean(temp$p_white[temp$type==1]), sd(temp$p_white[temp$type==1]),
         mean(temp$p_white[temp$type==2]), sd(temp$p_white[temp$type==2]),
         mean(temp$p_white[temp$type==3]), sd(temp$p_white[temp$type==3]),
         mean(temp$p_white[temp$type==1])-mean(temp$p_white[temp$type==2]),
         (var(temp$p_white[temp$type==1])/length(temp$p_white[temp$type==1])+
            var(temp$p_white[temp$type==2])/length(temp$p_white[temp$type==2]))^(0.5)),
       2,4)
matrix(c(mean(temp$p_hisp[temp$type==1]), sd(temp$p_hisp[temp$type==1]),
         mean(temp$p_hisp[temp$type==2]), sd(temp$p_hisp[temp$type==2]),
         mean(temp$p_hisp[temp$type==3]), sd(temp$p_hisp[temp$type==3]),
         mean(temp$p_hisp[temp$type==1])-mean(temp$p_hisp[temp$type==2]),
         (var(temp$p_hisp[temp$type==1])/length(temp$p_hisp[temp$type==1])+
            var(temp$p_hisp[temp$type==2])/length(temp$p_hisp[temp$type==2]))^(0.5)),
       2,4)
matrix(c(mean(temp$p_freelunch[temp$type==1]), sd(temp$p_freelunch[temp$type==1]),
         mean(temp$p_freelunch[temp$type==2]), sd(temp$p_freelunch[temp$type==2]),
         mean(temp$p_freelunch[temp$type==3]), sd(temp$p_freelunch[temp$type==3]),
         mean(temp$p_freelunch[temp$type==1])-mean(temp$p_freelunch[temp$type==2]),
         (var(temp$p_freelunch[temp$type==1])/length(temp$p_freelunch[temp$type==1])+
            var(temp$p_freelunch[temp$type==2])/length(temp$p_freelunch[temp$type==2]))^(0.5)),
       2,4)
matrix(c(mean(temp$n_student[temp$type==1]), sd(temp$n_student[temp$type==1]),
         mean(temp$n_student[temp$type==2]), sd(temp$n_student[temp$type==2]),
         mean(temp$n_student[temp$type==3]), sd(temp$n_student[temp$type==3]),
         mean(temp$n_student[temp$type==1])-mean(temp$n_student[temp$type==2]),
         (var(temp$n_student[temp$type==1])/length(temp$n_student[temp$type==1])+
            var(temp$n_student[temp$type==2])/length(temp$n_student[temp$type==2]))^(0.5)),
       2,4)
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,2)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,3)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(2,3)))


### K=4
### compare treated and never-treated within type
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[temp$type==1,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[temp$type==2,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[temp$type==3,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)
print(hotelling.test(ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ treated,
                     data=temp))

temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK4$gamma[temp$id]
temp <- temp[temp$type==4,]
temp$treated <- as.factor(temp$E <= T)
matrix(c(mean(temp$ccbase[temp$treated==TRUE]), sd(temp$ccbase[temp$treated==TRUE]),
         mean(temp$ccbase[temp$treated==FALSE]), sd(temp$ccbase[temp$treated==FALSE]),
         mean(temp$ccbase[temp$treated==TRUE])-mean(temp$ccbase[temp$treated==FALSE]),
         (var(temp$ccbase[temp$treated==TRUE])/length(temp$ccbase[temp$treated==TRUE])+
            var(temp$ccbase[temp$treated==FALSE])/length(temp$ccbase[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_white[temp$treated==TRUE]), sd(temp$p_white[temp$treated==TRUE]),
         mean(temp$p_white[temp$treated==FALSE]), sd(temp$p_white[temp$treated==FALSE]),
         mean(temp$p_white[temp$treated==TRUE])-mean(temp$p_white[temp$treated==FALSE]),
         (var(temp$p_white[temp$treated==TRUE])/length(temp$p_white[temp$treated==TRUE])+
            var(temp$p_white[temp$treated==FALSE])/length(temp$p_white[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_hisp[temp$treated==TRUE]), sd(temp$p_hisp[temp$treated==TRUE]),
         mean(temp$p_hisp[temp$treated==FALSE]), sd(temp$p_hisp[temp$treated==FALSE]),
         mean(temp$p_hisp[temp$treated==TRUE])-mean(temp$p_hisp[temp$treated==FALSE]),
         (var(temp$p_hisp[temp$treated==TRUE])/length(temp$p_hisp[temp$treated==TRUE])+
            var(temp$p_hisp[temp$treated==FALSE])/length(temp$p_hisp[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$p_freelunch[temp$treated==TRUE]), sd(temp$p_freelunch[temp$treated==TRUE]),
         mean(temp$p_freelunch[temp$treated==FALSE]), sd(temp$p_freelunch[temp$treated==FALSE]),
         mean(temp$p_freelunch[temp$treated==TRUE])-mean(temp$p_freelunch[temp$treated==FALSE]),
         (var(temp$p_freelunch[temp$treated==TRUE])/length(temp$p_freelunch[temp$treated==TRUE])+
            var(temp$p_freelunch[temp$treated==FALSE])/length(temp$p_freelunch[temp$treated==FALSE]))^(0.5)),
       2,3)
matrix(c(mean(temp$n_student[temp$treated==TRUE]), sd(temp$n_student[temp$treated==TRUE]),
         mean(temp$n_student[temp$treated==FALSE]), sd(temp$n_student[temp$treated==FALSE]),
         mean(temp$n_student[temp$treated==TRUE])-mean(temp$n_student[temp$treated==FALSE]),
         (var(temp$n_student[temp$treated==TRUE])/length(temp$n_student[temp$treated==TRUE])+
            var(temp$n_student[temp$treated==FALSE])/length(temp$n_student[temp$treated==FALSE]))^(0.5)),
       2,3)

### compare types
temp <- LutzData[KmeansData$time==1,]
temp$type <- kmeansK4$gamma[temp$id]
matrix(c(mean(temp$disd[temp$type==1]), sd(temp$disd[temp$type==1]),
         mean(temp$disd[temp$type==2]), sd(temp$disd[temp$type==2]),
         mean(temp$disd[temp$type==3]), sd(temp$disd[temp$type==3]),
         mean(temp$disd[temp$type==4]), sd(temp$disd[temp$type==4])),
       2,4)
matrix(c(mean(temp$ccbase[temp$type==1]), sd(temp$ccbase[temp$type==1]),
         mean(temp$ccbase[temp$type==2]), sd(temp$ccbase[temp$type==2]),
         mean(temp$ccbase[temp$type==3]), sd(temp$ccbase[temp$type==3]),
         mean(temp$ccbase[temp$type==4]), sd(temp$ccbase[temp$type==4])),
       2,4)
matrix(c(mean(temp$p_white[temp$type==1]), sd(temp$p_white[temp$type==1]),
         mean(temp$p_white[temp$type==2]), sd(temp$p_white[temp$type==2]),
         mean(temp$p_white[temp$type==3]), sd(temp$p_white[temp$type==3]),
         mean(temp$p_white[temp$type==4]), sd(temp$p_white[temp$type==4])),
       2,4)
matrix(c(mean(temp$p_hisp[temp$type==1]), sd(temp$p_hisp[temp$type==1]),
         mean(temp$p_hisp[temp$type==2]), sd(temp$p_hisp[temp$type==2]),
         mean(temp$p_hisp[temp$type==3]), sd(temp$p_hisp[temp$type==3]),
         mean(temp$p_hisp[temp$type==4]), sd(temp$p_hisp[temp$type==4])),
       2,4)
matrix(c(mean(temp$p_freelunch[temp$type==1]), sd(temp$p_freelunch[temp$type==1]),
         mean(temp$p_freelunch[temp$type==2]), sd(temp$p_freelunch[temp$type==2]),
         mean(temp$p_freelunch[temp$type==3]), sd(temp$p_freelunch[temp$type==3]),
         mean(temp$p_freelunch[temp$type==4]), sd(temp$p_freelunch[temp$type==4])),
       2,4)
matrix(c(mean(temp$n_student[temp$type==1]), sd(temp$n_student[temp$type==1]),
         mean(temp$n_student[temp$type==2]), sd(temp$n_student[temp$type==2]),
         mean(temp$n_student[temp$type==3]), sd(temp$n_student[temp$type==3]),
         mean(temp$n_student[temp$type==4]), sd(temp$n_student[temp$type==4])),
       2,4)
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,2)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,3)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(1,4)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(2,3)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(2,4)))
print(hotelling.test(disd + ccbase + p_white + p_hisp + p_freelunch + n_student
                     ~ type,
                     data=temp,pair=c(3,4)))
```

### never-treated units only ###

```{r never-treated units only}
### K-means clustering using entire time series
CrossValidationNT <- list(eps=0,
                          maxiter=200,
                          n_of_initial_values=5000)

CrossValidationNT$Constant <- function(t){
  ans <- 1
  return(ans)
} # 1 variables

CrossValidationNT$Constant2 <- function(t){
  ans <- c(1,(t>10))
  return(ans)
} # 2 variables

CrossValidationNT$Constant3 <- function(t){
  ans <- c(1,(t>7),(t>13))
  return(ans)
} # 3 variables

CrossValidationNT$LinSpline <- function(t){
  ans <- c(1,t-1)
  return(ans)
} # 2 variables

CrossValidationNT$LinSpline2 <- function(t){
  ans <- c(1,t-1,(t-10)*(t>10))
  return(ans)
} # 3 variables

# use never-treated units only
SubData <- KmeansData[KmeansData$E>T & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
temp <- unique(SubData$id)
SubData$id <- sapply(SubData$id,function(x) which(x==temp))
remove(temp)
SubData <- as.matrix(SubData)

# drop last three time periods for cross-validation
CrossValidationNT$result <- matrix(0,5,4)

for (TestTime in (T-2):T){
  TrainData <- SubData[SubData[,3]<TestTime,]
  TestData <- SubData[SubData[,3]==TestTime,]
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant)
  CrossValidationNT$result[1,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant2)
  CrossValidationNT$result[2,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant2(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$Constant3)
  CrossValidationNT$result[3,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$Constant3(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$LinSpline)
  CrossValidationNT$result[4,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$LinSpline(TestData[i,3]))))^2
    )
  
  temp <- kmeans_fun(CrossValidation$n_of_initial_values,TrainData,2,
                     CrossValidation$eps,CrossValidation$maxiter,
                     CrossValidationNT$LinSpline2)
  CrossValidationNT$result[5,T-TestTime+1] <- 
    mean((TestData[,1] - TestData[,6:ncol(TestData)]%*%temp$theta - 
            sapply(1:nrow(TestData),
                   function(i) sum(temp$delta[temp$gamma[TestData[i,2]],]*
                                     CrossValidationNT$LinSpline2(TestData[i,3]))))^2
    )
}
remove(temp,TestTime,TrainData,TestData)
CrossValidationNT$result[,4] <- apply(CrossValidationNT$result[,1:3],1,mean)
CrossValidationNT$result

if (which.min(CrossValidationNT$result[,4])==1){
  CrossValidationNT$spline <- CrossValidationNT$Constant
} else if (which.min(CrossValidationNT$result[,4])==2){
  CrossValidationNT$spline <- CrossValidationNT$Constant2
} else if (which.min(CrossValidationNT$result[,4])==3){
  CrossValidationNT$spline <- CrossValidationNT$Constant3
} else if (which.min(CrossValidationNT$result[,4])==4){
  CrossValidationNT$spline <- CrossValidationNT$LinSpline
} else if (which.min(CrossValidationNT$result[,4])==5){
  CrossValidationNT$spline <- CrossValidationNT$LinSpline2
}


# Type classification by extrapolation
SubData <- KmeansData[KmeansData$E>T & KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
temp <- unique(SubData$id)
SubData$id <- sapply(SubData$id,function(x) which(x==temp))
remove(temp)
SubData <- as.matrix(SubData)
kmeansNT <- kmeans_fun(KmeansParameter$n_of_initial_values,SubData,2,
                       KmeansParameter$eps,KmeansParameter$maxiter,
                       CrossValidationNT$spline)
table(c((kmeansK2$gamma)[KmeansData$E[KmeansData$time==1]>T]*10+kmeansNT$gamma))

SubData <- KmeansData[KmeansData$time>0,
                      c(ncol(KmeansData),2:(ncol(KmeansData)-1))] 
SubData <- as.matrix(SubData)
kmeansNT <- extrapolate_gamma(T,kmeansNT,SubData,CrossValidationNT$spline)
kmeansNT$typeTE <- retrieve_delta(T,kmeansNT$delta,CrossValidationNT$spline)
table(c(kmeansK2$gamma*10+kmeansNT$gamma))
table(c(kmeansK2$gamma*10+kmeansNT$gamma)[SubData[SubData[,3]==1,4]>T])
remove(SubData)

# CATT estimation
Pi <- matrix(0,N,2)
Pi[,1] <- 1
L <- matrix(0,N,1)
stackPi <- array(Pi,c(N,2,T))
stackL <- array(L,c(N,1,T))
pre <- 4
post <- 6

temp <- matrix(0,(pre+post)*2,6)
# type 1 and 2
temp[,2] <- c(rep(1,pre+post), rep(2,pre+post))
temp[,3] <- c((-pre-1):-2,0:(post-1),(-pre-1):-2,0:(post-1))
for (i in 1:(2*(pre+post))){
  temp[i,1] <- estCATT(kmeansNT$gamma,
                       stackPi,
                       stackL,
                       as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$estimate
  temp[i,4] <- sd(estCATT(kmeansNT$gamma,
                          stackPi,
                          stackL,
                          as.matrix(KmeansData[,1:5]),temp[i,3],temp[i,2])$score)/N^(0.5)
}
temp[,5] <- temp[,1] - 1.96*temp[,4]
temp[,6] <- temp[,1] + 1.96*temp[,4]

temp <- as.data.frame(temp)
colnames(temp) <- c("beta", "type", "r", "sd", "lb", "ub")
temp$type <- as.factor(temp$type)

### graph with 95% CI
temp_palette <- brewer.pal(n = 5, name = 'Set1')
ggplot(temp, aes(x=r,y=beta,group=type)) +
  geom_line(aes(color=type),linewidth=1.5) +
  geom_point(aes(color=type), size=2) +
  geom_errorbar(aes(ymin=lb, ymax=ub,color=type), linewidth=0.5, width=.2) +
  scale_colour_manual(values=temp_palette[1:5]) +
  geom_hline(yintercept=0, color = "black") +
  theme_light()
temp
remove(i,Pi,L,stackPi,stackL,pre,post,temp,temp_palette)
```

